Overview

> basicConfig()
> project_summary = "/Users/rory/cache/felipe-rnaseq/april-round/2016-05-04_fabio-april-merged/project-summary.csv"
> counts_file = "/Users/rory/cache/felipe-rnaseq/april-round/2016-05-04_fabio-april-merged/combined.counts"
> tx2genes_file = "/Users/rory/cache/felipe-rnaseq/april-round/2016-05-04_fabio-april-merged/tx2gene.csv"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", 
+     "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> summarydata = summarydata[, colSums(is.na(summarydata)) < nrow(summarydata)]
> # handle newer bcbio-nextgen runs that use description as the key
> if ("description" %in% colnames(summarydata)) {
+     rownames(summarydata) = summarydata$description
+     summarydata$Name = rownames(summarydata)
+     summarydata$description = NULL
+ } else {
+     rownames(summarydata) = summarydata$Name
+     summarydata$Name = NULL
+ }
> summarydata = summarydata[order(rownames(summarydata)), ]
> if (file.exists(tx2genes_file)) {
+     sample_dirs = file.path("..", "..", rownames(summarydata))
+     salmon_files = file.path(sample_dirs, "salmon", "quant.sf")
+     sailfish_files = file.path(sample_dirs, "sailfish", "quant.sf")
+     new_sailfish = file.path(sample_dirs, "sailfish", "quant", "quant.sf")
+     new_salmon = file.path(sample_dirs, "salmon", "quant", "quant.sf")
+     if (file.exists(salmon_files[1])) {
+         loginfo("Using gene counts calculated from the Salmon transcript counts.")
+         sf_files = salmon_files
+     } else if (file.exists(sailfish_files[1])) {
+         loginfo("Using gene counts calculated from the Sailfish transcript counts.")
+         sf_files = sailfish_files
+     } else if (file.exists(new_sailfish[1])) {
+         loginfo("Using gene counts calculated from the Sailfish transcript counts.")
+         sf_files = new_sailfish
+     } else if (file.exists(new_salmon[1])) {
+         loginfo("Using gene counts calculated from the Salmon transcript counts.")
+         sf_files = new_salmon
+     }
+     names(sf_files) = rownames(summarydata)
+     tx2gene = read.table(tx2genes_file, sep = ",", row.names = NULL, header = FALSE)
+     txi.salmon = tximport(sf_files, type = "salmon", tx2gene = tx2gene, reader = readr::read_tsv, 
+         countsFromAbundance = "lengthScaledTPM")
+     counts = round(data.frame(txi.salmon$counts, check.names = FALSE))
+ } else {
+     loginfo("Using gene counts calculated from featureCounts.")
+     counts = read.table(counts_file, header = TRUE, row.names = "id", check.names = FALSE)
+ }
2016-05-15 20:34:15 INFO::Using gene counts calculated from the Sailfish transcript counts.
> counts = counts[, order(colnames(counts)), drop = FALSE]
> colnames(counts) = gsub(".counts", "", colnames(counts))
> 
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality", 
+     "rRNA_rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate", 
+     "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped", 
+     "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.", 
+     "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity", 
+     "X5.3.bias", "Duplicates.pct", "Duplicates", "Mapped.reads", "Average.insert.size", 
+     "Mapped.reads.pct", "Total.reads", "avg_coverage_per_region", "Mapped.Reads")
> summarydata[, "Fragment.Length.Mean"] = summarydata$Average.insert.size
> metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
> metadata = metadata[, colSums(is.na(metadata)) < nrow(metadata), drop = FALSE]
> sanitize_datatable = function(df, ...) {
+     # remove dashes which cause wrapping
+     DT::datatable(df, ..., rownames = gsub("-", "_", rownames(df)), colnames = gsub("-", 
+         "_", colnames(df)))
+ }
> # set seed for reproducibility
> set.seed(1454944673)

Sample metadata

> get_heatmap_fn = function(summarydata) {
+     # return the pheatmap function with or without metadata
+     if (ncol(metadata) == 0) {
+         return(pheatmap)
+     } else {
+         # rownames(metadata) = summarydata$Name
+         heatmap_fn = function(data, ...) {
+             pheatmap(data, annotation = metadata, ...)
+         }
+         return(heatmap_fn)
+     }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)

Quality control metrics

> qualimap_run = "Mapped" %in% colnames(summarydata)
> do_quality = "Mapped.reads" %in% colnames(summarydata)

Mapped reads

It looks like HC-2_9 and MS-2_5 failed, the mapped reads are very low.

> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")

> ggplot(summarydata, aes(x = Name, y = Mapped.reads)) + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")

Genomic mapping rate

HC-2_9 definitely failed, the genomic mapping rate is near zero. MS-2_5 looks like it just had much less reads than the other samples.

> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") + 
+     ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90))

> ggplot(summarydata, aes(x = Name, y = Mapped.reads.pct)) + geom_bar(stat = "identity") + 
+     ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90))

Number of genes detected

Despite MS-2_5 looking okay in terms of the mapping rate, the genes detected is low. We’ll probably have to drop using this sample.

> dd = data.frame(Name = colnames(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") + 
+     xlab("")

Gene detection saturation

Barring the two troubled samples, these libraries look great.

> dd = data.frame(Mapped = summarydata$Mapped, Genes.Detected = colSums(counts > 
+     0))
> ggplot(dd, aes(x = Mapped, y = Genes.Detected)) + geom_point() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     ylab("genes detected") + xlab("reads mapped")

Exonic mapping rate

Exonic mapping rate looks great, most of the reads that align align to annotated exons. Excellent.

> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") + 
+     xlab("")

rRNA mapping rate

We can see the failed sample has a really high rRNA mapping rate. There is still a lot of rRNA in these samples, not sure why the rRNA removal isn’t working very well.

> eval_rRNA = "rRNA_rate" %in% colnames(summarydata) & !sum(is.na(summarydata$rRNA_rate)) == 
+     nrow(summarydata)
> ggplot(summarydata, aes(x = Name, y = rRNA_rate)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") + 
+     xlab("")

Estimated fragment length of paired-end reads

Fragment lengths look great this time.

> ggplot(summarydata, aes(x = Name, y = Fragment.Length.Mean)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("fragment length") + 
+     xlab("")

5’->3’ bias

We can see some samples have a 5’->3’ bias, this usually indicates some sort of degradation in these samples. If we had RIN numbers for the total RNA, we could confirm this, do we have these?

> ggplot(summarydata, aes(x = Name, y = X5.3.bias)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("5'->3' bias") + 
+     xlab("")

Drop outliers

It’s pretty clear we have to drop HC-2_9 and MS-2_5. We’ll drop these from further consideration now before moving forward.

> drop = c("HC-2_9", "MS-2_5")
> summarydata = subset(summarydata, !Name %in% drop)
> counts = counts[, summarydata$Name]

Boxplot of log10 counts per gene

> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Boxplot of log10 TMM-normalized counts per gene

Trimmed mean of M-values (TMM) normalization is described here

Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25

> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted = melted[order(melted$sample), ]
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Density of log10 TMM-normalized counts

> ggplot(melted, aes(x = count, group = sample)) + geom_density() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Correlation (Pearson) heatmap of TMM-normalized counts

> heatmap_fn(cor(normalized_counts, method = "pearson"))

Correlation (Spearman) heatmap of TMM-normalized counts

> heatmap_fn(cor(normalized_counts, method = "spearman"))

PCA plots

> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = ~Name)
> vst = varianceStabilizingTransformation(dds)
> pca_loadings = function(object, ntop = 500) {
+     rv <- matrixStats::rowVars(assay(object))
+     select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
+     pca <- prcomp(t(assay(object)[select, ]))
+     percentVar <- pca$sdev^2/sum(pca$sdev^2)
+     names(percentVar) = colnames(pca$x)
+     pca$percentVar = percentVar
+     return(pca)
+ }
> pc = pca_loadings(vst)
> comps = data.frame(pc$x)
> comps$Name = rownames(comps)
> library(dplyr)
> comps = comps %>% left_join(summarydata, by = c(Name = "Name"))
> colorby = "Name"
> pca_plot = function(comps, nc1, nc2, colorby) {
+     c1str = paste0("PC", nc1)
+     c2str = paste0("PC", nc2)
+     ggplot(comps, aes_string(c1str, c2str, color = colorby)) + geom_point() + 
+         theme_bw() + xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), 
+         "% variance")) + ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 
+         100), "% variance"))
+ }

PC1 vs. PC2

We can see the first component is dominated by the cells being plated or not; it isn’t clear how useful having these plated cells are, they are very different from all of the other samples. If they are unplated and also untreated, we cannot tell if the differences are just due to not being treated or being plated.

We can also see the LPS and LPS+IL27 cells don’t seem to cluster together very well either, not do the MS vs HC cells. With so few samples and so much noise it will be hard to pull out a reliable signal.

> pca_plot(comps, 1, 2, "condition")

> pca_plot(comps, 1, 2, "treatment")

> pca_plot(comps, 1, 2, "patient")

PC3 vs. PC4

The HC and MS samples separate nicely along the third and fourth components, which gives some hope that we can pull out a HC vs MS signal. However the IL27 and LPS treatments do not reliably separate so it is unlikely we are going to see a strong signal with these comparisons.

> pca_plot(comps, 3, 4, "condition")

> pca_plot(comps, 3, 4, "treatment")

> pca_plot(comps, 3, 4, "patient")

PC5 vs. PC6

Components 5 and 6 don’t separate out the samples very well at all, so most of the information is captured by the first 4 components.

> pca_plot(comps, 5, 6, "condition")

> pca_plot(comps, 5, 6, "treatment")

> pca_plot(comps, 5, 6, "patient")

Variance explained by component

> ggplot(data.frame(component = reorder(names(pc$percentVar), -pc$percentVar), 
+     percent_var = pc$percentVar), aes(component, percent_var)) + geom_bar(stat = "identity") + 
+     ylab("percent of total variation") + xlab("") + theme_bw()

> # snagged from development version of DESeq
> DESeqDataSetFromTximport <- function(txi, colData, design, ...) {
+     counts <- round(txi$counts)
+     mode(counts) <- "integer"
+     dds <- DESeqDataSetFromMatrix(countData = counts, colData = colData, design = design, 
+         ...)
+     stopifnot(txi$countsFromAbundance %in% c("no", "scaledTPM", "lengthScaledTPM"))
+     if (txi$countsFromAbundance %in% c("scaledTPM", "lengthScaledTPM")) {
+         message("using length scaled TPM counts from tximport")
+     } else {
+         message("using counts and average transcript lengths from tximport")
+         lengths <- txi$length
+         dimnames(lengths) <- dimnames(dds)
+         assays(dds)[["avgTxLength"]] <- lengths
+     }
+     return(dds)
+ }
> 
> subset_tximport = function(txi, rows, columns) {
+     txi$counts = txi$counts[rows, columns]
+     txi$abundance = txi$abundance[rows, columns]
+     txi$length = txi$length[rows, columns]
+     return(txi)
+ }

Differential expression

We will drop the unplated samples because they prevent us from looking at interactons between the treatment and disease state since we don’t have treatment for the unplated samples.

These experiments aren’t really set up to be able to answer questions like what is the effect of LPS and LPS+IL27 treatment on the MS and HC cells, because we don’t have any untreated cells from both groups to compare.

We can fit these models in a couple of different ways to try to look at some of the differences. The first thing we will do is to compare the MS LPS and HC LPS treated cells to each other and the MS LPS+IL27 and HC LPS+IL27 treated cells to each other. This gives us two lists of genes that are what is different between those cells. This will capture the overall differences between HC and MS cells and also the interaction effect of each treatment on the cells themselves, so if the HC and MS cells start off at different places, we will capture that.

> library(DEGreport)
> library(vsn)
> summarydata = subset(summarydata, treatment != "unplated")
> summarydata$cond_treat = paste(summarydata$condition, summarydata$treatment, 
+     sep = "_")
> design = ~cond_treat
> counts = counts[, rownames(summarydata)]
> counts <- counts[rowSums(counts > 0) > 1, ]
> if (exists("txi.salmon")) {
+     loginfo("Using Sailfish gene counts for the DESeq2 model.")
+     txi.salmon = subset_tximport(txi.salmon, rownames(counts), colnames(counts))
+     dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
+ } else {
+     loginfo("Using counts from featureCounts for the DESeq2 model.")
+     dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, 
+         design = design)
+ }
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 
+     0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)
> aprildds = dds
> save(aprildds, file = "april_dds.RData")
> library(biomaRt)
> mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
> conversions = getBM(attributes = c("ensembl_gene_id", "hgnc_symbol"), mart = mart)

Dispersion estimates

We see a large amount of biological variation which is due to these being human samples.

> plotDispEsts(dds)

LPS on MS vs HC

This compares the LPS treated MS samples to the LPS treated HC samples. This will capture the effect that LPS treatment has on the HC cells vs the MS cells, but it will also capture innate differences between the HC cells and MS cells.

> ms_vs_hc_lps = results(dds, contrast = list("cond_treatMS_LPS", "cond_treatHC_LPS"))
> plotMA(ms_vs_hc_lps)

> ms_vs_hc_lps = data.frame(ms_vs_hc_lps)
> volcano_density_plot(ms_vs_hc_lps[, c(2, 6)])

> ms_vs_hc_lps$ensembl_gene_id = rownames(ms_vs_hc_lps)
> ms_vs_hc_lps = ms_vs_hc_lps %>% left_join(conversions, by = "ensembl_gene_id")
> write.table(ms_vs_hc_lps, file = "ms_vs_hc_lps.csv", row.names = FALSE, col.names = TRUE, 
+     quote = FALSE, sep = ",")

We find 84 genes different comparing the LPS treated MS cells to the LPS treated MS cells.

LPS+IL27 on MS vs HC

This compares the LPS+IL27 treated MS samples to the LPS+IL27 treated HC samples. This will capture the effect that LPS+IL27 treatment has on the HC cells vs the MS cells, but it will also capture innate differences between the HC cells and MS cells. It also does not capture effects specific to IL27 treatment, it detects the combination of LPS treatment, IL27 treatment and innate MS vs HC cell differences along with any interactions between the treatments and the cell types.

> ms_vs_hc_il27 = results(dds, contrast = list("cond_treatMS_LPS.IL27", "cond_treatHC_LPS.IL27"))
> plotMA(ms_vs_hc_il27)

> ms_vs_hc_il27 = data.frame(ms_vs_hc_il27)
> volcano_density_plot(ms_vs_hc_il27[, c(2, 6)])

> ms_vs_hc_il27$ensembl_gene_id = rownames(ms_vs_hc_il27)
> ms_vs_hc_il27 = ms_vs_hc_il27 %>% left_join(conversions, by = "ensembl_gene_id")
> write.table(ms_vs_hc_il27, file = "ms_vs_hc_il27.csv", row.names = FALSE, col.names = TRUE, 
+     quote = FALSE, sep = ",")

We find 73 genes different comparing the IL27+LPS treated MS cells to the IL27+LPS treated MS cells.

LPS vs LPS+IL27 on MS and HC

Felipe asked for this over email:

The most important question that we want to figure it out (the primary objective):
1) How do the cells change after adding IL27? So, first comparison and independently:
HC: LPS vs. LPS+IL27
MS: LPS vs. LPS+IL27

We can do that pretty easily using what we set up above and instead of comparing across the cell types, we can compare across the treatments within a cell type.

LPS vs LPS+IL27 in MS

This is the effect of treating the MS cells with LPS+IL27 compared to treatment with just LPS. We don’t detect any differences. The LPS and LPS+IL27 treatments do not cluster together very well on the PCA plot in the QC section and we only have a very small number of samples. This means there is a lot of variation with treatments and so we would need a large number of samples to detect if there is an effect of IL27 treatment.

> il27_ms = results(dds, contrast = list("cond_treatMS_LPS.IL27", "cond_treatMS_LPS"))
> plotMA(il27_ms)

> il27_ms = data.frame(il27_ms)
> volcano_density_plot(il27_ms[, c(2, 6)])

> il27_ms$ensembl_gene_id = rownames(il27_ms)
> il27_ms = il27_ms %>% left_join(conversions, by = "ensembl_gene_id")
> write.table(il27_ms, file = "il27_ms.csv", row.names = FALSE, col.names = TRUE, 
+     quote = FALSE, sep = ",")

We find 2 genes different comparing the IL27+LPS treated MS cells to the LPS treated MS cells.

LPS vs LPS+IL27 in HC

This is the effect of treating the HC cells with LPS+IL27 compared to treatment with just LPS. We don’t detect any differences. The LPS and LPS+IL27 treatments do not cluster together very well on the PCA plot in the QC section and we only have a very small number of samples. This means there is a lot of variation with treatments and so we would need a large number of samples to detect if there is an effect of IL27 treatment.

> il27_hc = results(dds, contrast = list("cond_treatHC_LPS.IL27", "cond_treatHC_LPS"))
> plotMA(il27_hc)

> il27_hc = data.frame(il27_hc)
> volcano_density_plot(il27_hc[, c(2, 6)])

> il27_hc$ensembl_gene_id = rownames(il27_hc)
> il27_hc = il27_hc %>% left_join(conversions, by = "ensembl_gene_id")
> write.table(il27_hc, file = "il27_hc.csv", row.names = FALSE, col.names = TRUE, 
+     quote = FALSE, sep = ",")

We find 0 genes different comparing the IL27+LPS treated MS cells to the LPS treated HC cells.

Interaction fitting

We can also fit an interaction term which should less us separate out the specific effect treating the MS cells with LPS+IL27.

> design = ~condition + treatment + condition:treatment
> dds = DESeqDataSetFromTximport(txi.salmon, colData = summarydata, design = design)
> geoMeans = apply(counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 
+     0]))))
> dds = estimateSizeFactors(dds, geoMeans = geoMeans)
> dds = DESeq(dds)

Specific effect of LPS+IL27 treatment on MS cells

This is the specific effect of LPS+IL27 treatment on the MS cells– this has a little more power because we are breaking up the variance between the samples and assigning some of it to the cell type and treatment which is why we see a few genes we didn’t by looking at HC and MS separately.

> ms_il27_specific = results(dds, name = "conditionMS.treatmentLPS.IL27")
> plotMA(ms_il27_specific)

> ms_il27_specific = data.frame(ms_il27_specific)
> volcano_density_plot(ms_il27_specific[, c(2, 6)])

> ms_il27_specific$ensembl_gene_id = rownames(ms_il27_specific)
> ms_il27_specific = ms_il27_specific %>% left_join(conversions, by = "ensembl_gene_id")
> write.table(ms_il27_specific, file = "ms_il27_specific.csv", row.names = FALSE, 
+     col.names = TRUE, quote = FALSE, sep = ",")

We find 5 genes different in the MS cells specific to the treatment with LPS+IL27.

Summary

These libraries look the best out of everything we’ve looked at so far. Even so at least one sample failed and one other sample looks pretty bad.

In terms an effect of adding IL27, we aren’t seeing too much different between the samples. When we’re looking for subtle effects with patient samples, just a couple of samples isn’t enough data points to have enough power to find differences because variation is too large to pull out anything significant. If we had untreated controls for the MS samples we would likely be able to see the effects of LPS treatment, since that effect seems large.